From: Brian Wolff Date: Mon, 12 Aug 2013 16:18:29 +0000 (-0300) Subject: Add "extended" file metadata to API X-Git-Tag: 1.31.0-rc.0~18354^2 X-Git-Url: http://git.cyclocoop.org//%22http:/%22.attribut_html%28%24lesurls%5B%24numero%5D%29.%22/%22?a=commitdiff_plain;h=d14178139662d9d1303cd7d40750bed10aab5385;p=lhc%2Fweb%2Fwiklou.git Add "extended" file metadata to API Part of the point of this, is to add a hook to allow extensions to add their own metadata (I intend to create a companion extension to parse Commons description pages - I5e6bc45f9751) It's hoped that this would provide a simple system to get file metadata, and would be able to return information on any wiki (even without any extensions installed to provide additional information). So it could fallback to exif data, if there's no better source of information for the file available. It's also meant to be done in such a way that, in the future, when Wikibase is deployed on Commons, it could be integrated without too much fuss. marktraceur changed this patch to be a little less heavy-handed: it will now return unformatted data by default from the API, but there's also a method for formatting that data in the way that bawolff originally intended. I'm still trying to figure out if there's much use in that method, but for now it's not particularly useful. Change-Id: I77303d8e535fc1c42e14cfb853814e5c434a81ec --- diff --git a/docs/hooks.txt b/docs/hooks.txt index 2d1001ba52..d18679b300 100644 --- a/docs/hooks.txt +++ b/docs/hooks.txt @@ -1157,6 +1157,13 @@ $title: Title object that we need to get a sortkey for underscore) magic words. Called by MagicWord. &$doubleUnderscoreIDs: array of strings +'GetExtendedMetadata': Get extended file metadata for the API +&$combinedMeta: Array of the form: 'MetadataPropName' => array( +'value' => prop value, 'source' => 'name of hook' ). 
+$file: File object of file in question +$context: RequestContext (including language to use) +&$maxCacheTime: how long the results can be cached + 'GetFullURL': Modify fully-qualified URLs used in redirects/export/offsite data. $title: Title object of page $url: string value as output (out parameter, can modify) @@ -2738,6 +2745,12 @@ $userId: User id of the current user $userText: User name of the current user &$items: Array of user tool links as HTML fragments +'ValidateExtendedMetadataCache': Called to validate the cached metadata in +FormatMetadata::getExtendedMeta (return false means cache will be +invalidated and GetExtendedMetadata hook called again). +$timestamp: The timestamp metadata was generated +$file: The file the metadata is for + 'WantedPages::getQueryInfo': Called in WantedPagesPage::getQueryInfo(), can be used to alter the SQL query which gets the list of wanted pages. &$wantedPages: WantedPagesPage object diff --git a/includes/api/ApiQueryImageInfo.php b/includes/api/ApiQueryImageInfo.php old mode 100644 new mode 100755 index 0ea286847b..1e80f57120 --- a/includes/api/ApiQueryImageInfo.php +++ b/includes/api/ApiQueryImageInfo.php @@ -49,6 +49,10 @@ class ApiQueryImageInfo extends ApiQueryBase { $scale = $this->getScale( $params ); + $metadataOpts = array( + 'version' => $params['metadataversion'], + ); + $pageIds = $this->getPageSet()->getAllTitlesByNamespace(); if ( !empty( $pageIds[NS_FILE] ) ) { $titles = array_keys( $pageIds[NS_FILE] ); @@ -146,7 +150,9 @@ class ApiQueryImageInfo extends ApiQueryBase { $fit = $this->addPageSubItem( $pageId, self::getInfo( $img, $prop, $result, - $finalThumbParams, $params['metadataversion'] ) ); + $finalThumbParams, $metadataOpts + ) + ); if ( !$fit ) { if ( count( $pageIds[NS_FILE] ) == 1 ) { // See the 'the user is screwed' comment above @@ -178,7 +184,7 @@ class ApiQueryImageInfo extends ApiQueryBase { $fit = self::getTransformCount() < self::TRANSFORM_LIMIT && $this->addPageSubItem( $pageId, 
self::getInfo( $oldie, $prop, $result, - $finalThumbParams, $params['metadataversion'] + $finalThumbParams, $metadataOpts ) ); if ( !$fit ) { @@ -296,10 +302,18 @@ class ApiQueryImageInfo extends ApiQueryBase { * @param array $prop of properties to get (in the keys) * @param $result ApiResult object * @param array $thumbParams containing 'width' and 'height' items, or null - * @param string $version Version of image metadata (for things like jpeg which have different versions). + * @param string|array $metadataOpts Options for metadata fetching. + * This is an array consisting of the keys: + * 'version': The metadata version for the metadata option * @return Array: result array */ - static function getInfo( $file, $prop, $result, $thumbParams = null, $version = 'latest' ) { + static function getInfo( $file, $prop, $result, $thumbParams = null, $metadataOpts = false ) { + if ( !$metadataOpts || is_string( $metadataOpts ) ) { + $metadataOpts = array( + 'version' => $metadataOpts ?: 'latest', + ); + } + $version = $metadataOpts['version']; $vals = array(); // Timestamp is shown even if the file is revdelete'd in interface // so do same here. @@ -359,6 +373,7 @@ class ApiQueryImageInfo extends ApiQueryBase { $url = isset( $prop['url'] ); $sha1 = isset( $prop['sha1'] ); $meta = isset( $prop['metadata'] ); + $extmetadata = isset( $prop['extmetadata'] ); $mime = isset( $prop['mime'] ); $mediatype = isset( $prop['mediatype'] ); $archive = isset( $prop['archivename'] ); @@ -417,6 +432,15 @@ class ApiQueryImageInfo extends ApiQueryBase { $vals['metadata'] = $metadata ? self::processMetaData( $metadata, $result ) : null; } + if ( $extmetadata ) { + // Note, this should return an array where all the keys + // start with a letter, and all the values are strings. + // Thus there should be no issue with format=xml. 
+ $format = new FormatMetadata; + $extmetaArray = $format->fetchExtendedMetadata( $file ); + $vals['extmetadata'] = $extmetaArray; + } + if ( $mime ) { $vals['mime'] = $file->getMimeType(); } @@ -564,6 +588,7 @@ class ApiQueryImageInfo extends ApiQueryBase { ' (requires url and param ' . $modulePrefix . 'urlwidth)', 'mediatype' => ' mediatype - Adds the media type of the image', 'metadata' => ' metadata - Lists Exif metadata for the version of the image', + 'extmetadata' => ' extmetadata - Lists formatted metadata combined from multiple sources. Results are HTML formatted.', 'archivename' => ' archivename - Adds the file name of the archive version for non-latest versions', 'bitdepth' => ' bitdepth - Adds the bit depth of the version', 'uploadwarning' => ' uploadwarning - Used by the Special:Upload page to get information about an existing file. Not intended for use outside MediaWiki core', diff --git a/includes/filerepo/file/ForeignAPIFile.php b/includes/filerepo/file/ForeignAPIFile.php old mode 100644 new mode 100755 index ed96d446d5..a1a6f2691c --- a/includes/filerepo/file/ForeignAPIFile.php +++ b/includes/filerepo/file/ForeignAPIFile.php @@ -169,6 +169,16 @@ class ForeignAPIFile extends File { return null; } + /** + * @return array|null extended metadata (see imageinfo API for format) or null on error + */ + public function getExtendedMetadata() { + if ( isset( $this->mInfo['extmetadata'] ) ) { + return $this->mInfo['extmetadata']; + } + return null; + } + /** * @param $metadata array * @return array diff --git a/includes/media/FormatMetadata.php b/includes/media/FormatMetadata.php old mode 100644 new mode 100755 index 70d76dda66..17de74ab4a --- a/includes/media/FormatMetadata.php +++ b/includes/media/FormatMetadata.php @@ -1416,6 +1416,233 @@ class FormatMetadata extends ContextSource { $tel )->text(); } } + + /** + * Get a list of fields that are visible by default. 
+ * + * @return array + * @since 1.23 + */ + public static function getVisibleFields() { + $fields = array(); + $lines = explode( "\n", wfMessage( 'metadata-fields' )->inContentLanguage()->text() ); + foreach ( $lines as $line ) { + $matches = array(); + if ( preg_match( '/^\\*\s*(.*?)\s*$/', $line, $matches ) ) { + $fields[] = $matches[1]; + } + } + $fields = array_map( 'strtolower', $fields ); + return $fields; + } + + /** + * Get an array of extended metadata. (See the imageinfo API for format.) + * + * @param File $file File to use + * @return array [<property name> => ['value' => <value>]], or [] on error + * @since 1.23 + */ + public function fetchExtendedMetadata( File $file ) { + global $wgMemc; + + // If revision deleted, exit immediately (before profiling starts, so every wfProfileIn has a matching wfProfileOut) + if ( $file->isDeleted( File::DELETED_FILE ) ) { + return array(); + } + + wfProfileIn( __METHOD__ ); + + $cacheKey = wfMemcKey( + 'getExtendedMetadata', + $this->getLanguage()->getCode(), + $file->getSha1() + ); + + $cachedValue = $wgMemc->get( $cacheKey ); + if ( + $cachedValue + && wfRunHooks( 'ValidateExtendedMetadataCache', array( $cachedValue['timestamp'], $file ) ) + ) { + $extendedMetadata = $cachedValue['data']; + } else { + $maxCacheTime = ( $file instanceof ForeignAPIFile ) ? 60 * 60 * 12 : 60 * 60 * 24 * 30; + $fileMetadata = $this->getExtendedMetadataFromFile( $file ); + $extendedMetadata = $this->getExtendedMetadataFromHook( $file, $fileMetadata, $maxCacheTime ); + // Make sure the metadata won't break the API when an XML format is used. + // This is an API-specific function so it would be cleaner to call it from + // outside fetchExtendedMetadata, but this way we don't need to redo the + // computation on a cache hit. 
+ $this->sanitizeArrayForXml( $extendedMetadata ); + $valueToCache = array( 'data' => $extendedMetadata, 'timestamp' => wfTimestampNow() ); + $wgMemc->set( $cacheKey, $valueToCache, $maxCacheTime ); + } + + wfProfileOut( __METHOD__ ); + return $extendedMetadata; + } + + /** + * Get file-based metadata in standardized format. + * + * Note that for a remote file, this might return metadata supplied by extensions. + * + * @param File $file File to use + * @return array [<property name> => ['value' => <value>]], or [] on error + * @since 1.23 + */ + protected function getExtendedMetadataFromFile( File $file ) { + // If this is a remote file accessed via an API request, we already + // have remote metadata so we just ignore any local one + if ( $file instanceof ForeignAPIFile ) { + // in case of error we pretend no metadata - this will get cached. Might or might not be a good idea. + return $file->getExtendedMetadata() ?: array(); + } + + wfProfileIn( __METHOD__ ); + + $uploadDate = wfTimestamp( TS_ISO_8601, $file->getTimestamp() ); + + $fileMetadata = array( + // This is modification time, which is close to "upload" time. + 'DateTime' => array( + 'value' => $uploadDate, + 'source' => 'mediawiki-metadata', + ), + ); + + $title = $file->getTitle(); + if ( $title ) { + $text = $title->getText(); + $pos = strrpos( $text, '.' ); + + if ( $pos ) { + $name = substr( $text, 0, $pos ); + } else { + $name = $text; + } + + $fileMetadata[ 'ObjectName' ] = array( + 'value' => $name, + 'source' => 'mediawiki-metadata', + ); + } + + // The common metadata may be false when it is unsupported for this file; treat that as "no fields". + $common = $file->getCommonMetaArray() ?: array(); + foreach ( $common as $key => $value ) { + $fileMetadata[$key] = array( + 'value' => $value, + 'source' => 'file-metadata', + ); + } + + wfProfileOut( __METHOD__ ); + return $fileMetadata; + } + + /** + * Get additional metadata from hooks in standardized format. 
+ * + * @param File $file File to use + * @param array $extendedMetadata + * @param int $maxCacheTime hook handlers might use this parameter to override cache time + * + * @return array [ => ['value' => ]], or [] on error + * @since 1.23 + */ + protected function getExtendedMetadataFromHook( File $file, array $extendedMetadata, &$maxCacheTime ) { + wfProfileIn( __METHOD__ ); + + wfRunHooks( 'GetExtendedMetadata', array( + &$extendedMetadata, + $file, + $this->getContext(), + &$maxCacheTime + ) ); + + $visible = array_flip( self::getVisibleFields() ); + foreach ( $extendedMetadata as $key => $value ) { + if ( !isset( $visible[ strtolower( $key ) ] ) ) { + $extendedMetadata[$key]['hidden'] = ''; + } + } + + wfProfileOut( __METHOD__ ); + return $extendedMetadata; + } + + /** + * Makes sure the given array is a valid API response fragment + * (can be transformed into XML) + * @param array $arr + */ + protected function sanitizeArrayForXml( &$arr ) { + if ( !is_array( $arr ) ) { + return; + } + + $counter = 1; + foreach ( $arr as $key => &$value ) { + $sanitizedKey = $this->sanitizeKeyForXml( $key ); + if ( $sanitizedKey !== $key ) { + if ( isset( $arr[$sanitizedKey] ) ) { + // Make the sanitized keys hopefully unique. + // To make it definitely unique would be too much effort, given that + // sanitizing is only needed for misformatted metadata anyway, but + // this at least covers the case when $arr is numeric. + $sanitizedKey .= $counter; + ++$counter; + } + $arr[$sanitizedKey] = $arr[$key]; + unset( $arr[$key] ); + } + if ( is_array( $value ) ) { + $this->sanitizeArrayForXml( $value ); + } + } + } + + /** + * Turns a string into a valid XML identifier. + * Used to ensure that keys of an associative array in the + * API response do not break the XML formatter. 
+ * @param string $key + * @return string + * @since 1.23 + */ + protected function sanitizeKeyForXml( $key ) { + // drop all characters which are not valid in an XML tag name + // a bunch of non-ASCII letters would be valid but probably won't + // be used so we take the easy way (note: A-Z, not A-z, which would also admit [\]^`) + $key = preg_replace( '/[^a-zA-Z0-9_:.-]/', '', $key ); + // drop characters which are invalid at the first position (hyphen escaped for PCRE2) + $key = preg_replace( '/^[\d\-.]+/', '', $key ); + + if ( $key == '' ) { + $key = '_'; + } + + // special case for an internal keyword + if ( $key == '_element' ) { + $key = 'element'; + } + + return $key; + } + + /** + * Returns a list of languages (first is best) to use when formatting multilang fields, + * based on user and site preferences. + * @return array + * @since 1.23 + */ + protected function getPriorityLanguages() + { + $priorityLanguages = Language::getFallbacksIncludingSiteLanguage( $this->getLanguage()->getCode() ); + $priorityLanguages = array_merge( (array) $this->getLanguage()->getCode(), $priorityLanguages[0], $priorityLanguages[1] ); + return $priorityLanguages; + } } /** For compatability with old FormatExif class diff --git a/includes/media/MediaHandler.php b/includes/media/MediaHandler.php index fd5b21cb26..ddb8efd7d1 100755 --- a/includes/media/MediaHandler.php +++ b/includes/media/MediaHandler.php @@ -208,6 +208,10 @@ abstract class MediaHandler { * FormatMetadata::getFormattedData() on the full response array, which * transforms all values into prettified, human-readable text. * + * Subclasses overriding this function must return a value which is a + * valid API response fragment (all associative array keys are valid + * XML tagnames). + * * Note, if the file simply has no metadata, but the handler supports * this interface, it should return an empty array, not false. 
* @@ -469,16 +473,7 @@ abstract class MediaHandler { * @access protected */ function visibleMetadataFields() { - $fields = array(); - $lines = explode( "\n", wfMessage( 'metadata-fields' )->inContentLanguage()->text() ); - foreach ( $lines as $line ) { - $matches = array(); - if ( preg_match( '/^\\*\s*(.*?)\s*$/', $line, $matches ) ) { - $fields[] = $matches[1]; - } - } - $fields = array_map( 'strtolower', $fields ); - return $fields; + return FormatMetadata::getVisibleFields(); } /**